import re
from urllib import parse

import scrapy
from scrapy import Request
from poetry.items import CipaiItem, ArticleItemLoader, PoetryItem, AuthorItem
from scrapy.loader import ItemLoader

# Root URL of the site being crawled.
base_url = "https://www.shicimingju.com"

# The spider defined below crawls all poem information, author by author.
class AuthorSpider(scrapy.Spider):
    """Crawl shicimingju.com author by author and emit one item per poem.

    The crawl has three stages:

    * ``parse`` walks the paginated author index,
    * ``parse2`` walks each author's paginated poem list,
    * ``parse_detail`` turns a single poem page into a ``PoetryItem``.
    """

    name = 'author'
    allowed_domains = ['www.shicimingju.com']
    start_urls = ['https://www.shicimingju.com/category/all']
    # start_urls = ['https://www.shicimingju.com/chaxun/zuozhe/91.html']

    def _next_page_request(self, response, callback):
        """Build the request for the "next page" link of a paginated listing.

        Args:
            response: The listing page currently being parsed.
            callback: The spider method that should parse the next page.

        Returns:
            A ``Request`` for the next page, or ``None`` when the page has
            no "下一页" (next page) link.
        """
        next_href = response.xpath(
            '//*[@id="list_nav_part"]/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if not next_href:
            return None
        # response.urljoin resolves the href against the current page, so
        # absolute, root-relative and document-relative links all work.
        return Request(url=response.urljoin(next_href), callback=callback)

    def parse(self, response):
        """Parse one page of the author index.

        Yields:
            One ``Request`` per author link (handled by ``parse2``), followed
            by a ``Request`` for the next index page when there is one.
        """
        author_hrefs = response.xpath(
            '//*[@id="main_left"]/div/div[@class="zuozhe_list_item"]/h3/a/@href'
        ).extract()

        for href in author_hrefs:
            # meta["url"] keeps the href exactly as it appeared in the page.
            yield Request(response.urljoin(href), callback=self.parse2, meta={"url": href})

        next_request = self._next_page_request(response, self.parse)
        if next_request is not None:
            yield next_request

    def parse2(self, response):
        """Parse one page of an author's poem list.

        Only links ending in ``html`` are followed; everything else in the
        list is skipped.

        Yields:
            One ``Request`` per poem link (handled by ``parse_detail``),
            followed by a ``Request`` for the next list page when there is one.
        """
        poem_hrefs = response.xpath(
            '//*[@id="main_left"]//div[@class="shici_list_main"]/h3/a/@href'
        ).extract()

        for href in poem_hrefs:
            if not href.endswith('html'):
                continue
            # urljoin (rather than prefixing the site root) also copes with
            # hrefs that are already absolute.
            yield Request(response.urljoin(href), callback=self.parse_detail, meta={"url": href})

        next_request = self._next_page_request(response, self.parse2)
        if next_request is not None:
            yield next_request

    def parse_detail(self, response):
        """Parse a single poem page into a ``PoetryItem``.

        The ``url`` field is the href passed along in ``response.meta`` by
        ``parse2`` (empty string when missing), i.e. the link as written in
        the list page rather than the final request URL.

        Yields:
            The item produced by the ``ArticleItemLoader``.
        """
        item_loader = ArticleItemLoader(item=PoetryItem(), response=response)

        # string(.) flattens each node to its full text, nested tags included.
        title = response.xpath('//*[@id="zs_title"]').xpath('string(.)').extract()
        content = response.xpath('//*[@id="zs_content"]').xpath('string(.)').extract()
        annotation = response.xpath('//*[@id="item_shangxi"]').xpath('string(.)').extract()
        tag = response.xpath('//*[@class="shici-mark"]/a/text()').extract()
        image_src = response.xpath('//*[@id="item_div"]/img/@src').extract_first()

        item_loader.add_value('url', response.meta.get("url", ""))
        item_loader.add_value('title', title)
        item_loader.add_xpath('dynasty', '//div[@class="niandai_zuozhe"]/text()')
        item_loader.add_xpath('author', '//div[@class="niandai_zuozhe"]/a/text()')
        item_loader.add_value('content', content)
        item_loader.add_value('annotation', annotation)
        item_loader.add_value('tag', tag)
        item_loader.add_xpath('relation', '//*[@class="shici_list_main"]/h3/a/text()')
        if image_src:
            # Resolve against the page URL so an absolute src is not mangled
            # by blindly prefixing the site root.
            item_loader.add_value('images', response.urljoin(image_src))

        yield item_loader.load_item()
